This is an investigation on the use of Topic Modeling on the course catalog at Florida Polytechnic University.
params$department
## [1] "Data Science and Business Analytics"
library(tidytext)
library(textmineR)
library(broom)
library(tidyr)
library(dplyr)
library(ggplot2)
library(here)
library(stringr)
library(DT)
library(proxy)
library(smacof)
library(ggrepel)
# library(MASS)
library(stringdist)
library(widyr)
library(igraph)
library(ggraph)
ptsize <- 2
legends <- TRUE
dist_lmt <- 3.4
library(readr, quietly = T)
set.seed(543)
source(here("transform_course_data.R"))
data <- read_csv(here("data/courses-list-fpu.csv"))
filter_regex = ""
replace_regex = ""
filtering_string <- regex(paste0("^Week|\\s{2}|^\\s{1}|^Quiz|^Chapter|^Case|^http|Ch.|^Incoterms|Exam|Presentations|www|",
filter_regex),
ignore_case = TRUE)
replace_string <- regex(paste0("^\\d{1}\\. |\\d{2}\\. |^Lab \\d{1}. |^Lab \\d{2}. |^[a-z]. |^\\d{1}.|",
replace_regex),
ignore_case = TRUE)
data <- clean_columns(data)
# This will eventually be its own script
outl_df <- data %>%
mutate(new_col = strsplit(as.character(Course_Description), "[\\\r\\\n\\\t]+"))
main_outl_df <- outl_df %>% tidyr::separate_rows(new_col, sep = "^[0-9].") %>% filter(!grepl("^\\d{1}\\. |\\d{2}\\. ", new_col))
# Getting everything else just in case we need them
side_outl_df <- outl_df %>% tidyr::separate_rows(new_col, sep = "^[0-9].") %>% filter(!grepl(paste0("^\\d{1}\\. |\\d{2}\\. "), new_col))
main_outl_df$new_col <- main_outl_df$new_col %>%
str_replace_all(replace_string, "")
# Separate
side_outl_new <- side_outl_df %>%
filter(!grepl(filtering_string,
new_col, ignore.case = TRUE)) %>%
filter(!is.na(new_col))
side_outl_new$new_col <- side_outl_new$new_col %>% str_replace_all(replace_string, "")
# Joining the two dataframes for the new_col
full_outl <- main_outl_df %>% rbind(side_outl_new)
# Filtering to the department
full_outl <- full_outl %>%
filter(Department_Name == params$department)
# filter(Department_Name == "Computer Science")
# filter(Department_Name == "Data Science and Business Analytics")
# Getting bigrams
terms_bigram <- full_outl %>%
select(c(Course_ID, new_col)) %>%
unnest_tokens("desc_word", new_col, token = "ngrams", n = 2) %>%
separate(desc_word, c("word1", "word2")) %>%
filter(!word1 %in% c(stop_words$word, "research", "scientific", "paper", "guest", "topics", "based", "covers", "current", "toolset", "current", "student", "unknown", "senior", "relevant", "term")) %>%
filter(!grepl("^[0-9]", word1)) %>%
filter(!word2 %in% c(stop_words$word, "include", "information", "sources", "project", "term", "base")) %>%
filter(!grepl("^[0-9]", word2)) %>%
unite(desc_bigram, word1, word2, sep = " ") %>%
filter(!desc_bigram == "NA NA")
bigram_dtm <- terms_bigram %>%
count(Course_ID, desc_bigram, sort = TRUE) %>%
cast_dtm(Course_ID, desc_bigram, n)
# List of course_id matched to course names
course_list <- split(full_outl$Name, full_outl$Course_ID)
The first test is an LDA model with k = 5 using the Gibbs method.
library(topicmodels)
# k = 5 for the number of concentrations
bigram_lda <- LDA(bigram_dtm, k = ifelse(params$department == "Computer Science", 6, 5), method = "Gibbs", control=list(iter = 500, verbose = 25, alpha = 0.2))
## K = 5; V = 1046; M = 92
## Sampling 500 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Gibbs sampling completed!
course_topics <- tidy(bigram_lda, matrix = "beta")
course_topics %>%
datatable()
course_docs <- tidy(bigram_lda, matrix = "gamma")
course_docs %>%
group_by(topic) %>%
slice_max(gamma, n = 5) %>%
ungroup() %>%
arrange(topic, -gamma) %>%
datatable()
# course_top_docs %>%
# mutate(document = reorder_within(document, gamma, topic)) %>%
# ggplot(aes(gamma, document, fill = factor(document))) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~ topic, scales = "free") +
# scale_y_reordered()
The five concentrations are as follows:
- Logistics & Supply Chain Management
- Intelligent Mobility
- Quantitative Economics and Econometrics
- Big Data Analytics
- Health Systems Engineering
The LDA model we have seems to be able to spread the topics pretty well. But there seems to be a shortcoming in its ability to separate one concentration from one another. I believe this is due to the fact that a lot of the DSBA curriculum overlaps in many ways.
course_top_terms <- course_topics %>%
filter(!is.na(term)) %>%
group_by(topic) %>%
slice_max(beta, n = 9) %>%
ungroup() %>%
arrange(topic, -beta)
course_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_y_reordered()
ggsave("img/lda.png", dpi = 300)
There are some distance metrics I would like to try
- Hellinger Distance (In-progress) - Cosine Similarity (Isn’t this done when using MCA/CA?)
- Jaccard Similary (In-progress) - Sorensen-Dice Similarity (In-progress)
dist_euc <- bigram_dtm %>%
tidy() %>%
pairwise_dist(item = document, feature = term, value = count, method = "euclidean")
dist_euc %>%
filter(distance < dist_lmt) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = distance), show.legend = legends) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()+
labs(title = "Distance Plot: Euclidean")
dist_manh <- bigram_dtm %>%
tidy() %>%
pairwise_dist(item = document, feature = term, value = count, method = "manhattan")
dist_manh %>%
filter(distance < 15) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = distance), show.legend = legends) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void() +
labs(title = "Distance Plot: Manhattan")
sim_cos <- bigram_dtm %>%
tidy() %>%
pairwise_similarity(item = document, feature = term, value = count)
sim_cos %>%
mutate(distance = 1 - similarity) %>%
filter(distance < ifelse(params$department == "Computer Science", 0.9, 0.02)) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = distance), show.legend = legends) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void() +
labs(title = "1 - Cosine Similarity Plot: Bigrams")
delta_brw <- bigram_dtm %>%
tidy() %>%
pairwise_delta(item = document, feature = term, value = count, method = "burrows")
delta_brw %>%
filter(delta < 0.1) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = delta), show.legend = legends) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void() +
labs(title = "Burrows Delta")
delta_lnr <- bigram_dtm %>%
tidy() %>%
pairwise_delta(item = document, feature = term, value = count, method = "argamon")
delta_lnr %>%
filter(delta < 0.029) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = delta), show.legend = legends) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void() +
labs(title = "Argamon's Linear Delta")
This is the same as the graphs above, but using the full course descriptions by: - Getting each word individually
- removing the stop words (a, the, and)
- Rejoining all of the descriptions together
- Computing the distance matrices based on the full descriptions
# Doing what I said above
course_full_desc <- full_outl %>%
select(c(Course_ID, new_col)) %>%
unnest_tokens("word", new_col) %>%
filter(!word %in% stop_words$word) %>%
filter(!grepl("[0-9]", word)) %>%
group_by(Course_ID) %>%
summarise(text = str_c(word, collapse = " ")) %>%
ungroup() %>%
filter(!is.na(text))
cos_mat <- stringdistmatrix(course_full_desc$text, course_full_desc$text, useNames = FALSE, method = "cosine") %>%
as.matrix()
colnames(cos_mat) <- course_full_desc$Course_ID
rownames(cos_mat) <- course_full_desc$Course_ID
cos_course <- reshape2::melt(cos_mat)[reshape2::melt(upper.tri(cos_mat))$value,]
colnames(cos_course) <- c("Term1", "Term2", "distance")
cos_course %>%
filter(distance < 0.02) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = distance), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void() +
labs(title = "1 - Cosine Similarity Plot: Full Desc.")
ggsave("img/cos.png", dpi = 300)
jac_mat <- stringdistmatrix(course_full_desc$text, course_full_desc$text, useNames = FALSE, method = "jaccard") %>%
as.matrix()
colnames(jac_mat) <- course_full_desc$Course_ID
rownames(jac_mat) <- course_full_desc$Course_ID
jac_course <- reshape2::melt(jac_mat)[reshape2::melt(upper.tri(jac_mat))$value,]
colnames(jac_course) <- c("Term1", "Term2", "distance")
jac_course %>%
filter(distance < 0.04) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = distance), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = ptsize) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void() +
labs(title = "1 - Jaccard Similarity Plot: Full Desc.")
ggsave("img/jac.png", dpi = 300)
All MDS implementations are nonmetric (for ordinal data).
library(plotly)
mds_euc <- bigram_dtm %>%
stats::dist(method = "euclidean") %>%
# t() %>%
mds(type = "ordinal")
ggplot() +
geom_point(data = as_tibble(mds_euc$conf), aes(x = D1, y = D2)) +
geom_text(as_tibble(mds_euc$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
theme_minimal() +
labs(title = "MDS with Euclidean Distance Matrix")
mds_man <- bigram_dtm %>%
stats::dist(method = "manhattan") %>%
# t() %>%
mds(type = "ordinal")
ggplot() +
geom_point(data = as_tibble(mds_man$conf), aes(x = D1, y = D2)) +
geom_text(as_tibble(mds_man$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
theme_minimal() +
geom_text_repel() +
labs(title = "MDS with Manhattan Distance Matrix")
library(slam)
cosine_dist_mat <- 1 - crossprod_simple_triplet_matrix(t(bigram_dtm))/(sqrt(col_sums(t(bigram_dtm)^2) %*% t(col_sums(t(bigram_dtm)^2))))
mds_cos <- cosine_dist_mat %>%
# t() %>%
mds(type = "ordinal")
ggplot() +
geom_point(data = as_tibble(mds_cos$conf), aes(x = D1, y = D2)) +
geom_text(as_tibble(mds_cos$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
geom_text_repel() +
theme_minimal() +
labs(title = "MDS with 1 - Cosine Similarity")
Still trying to figure this one out
# mds_jac <- bigram_dtm %>%
# dist(method = "Jaccard", pairwise = TRUE) %>%
# # t() %>%
# mds(type = "ordinal")
#
#
# ggplot() +
# geom_point(data = as_tibble(mds_jac$conf), aes(x = D1, y = D2)) +
# geom_text(as_tibble(mds_jac$conf), mapping = aes(x = -D1,y= -D2), label = rownames(bigram_dtm)) +
# geom_text_repel() +
# theme_minimal() +
# labs(title = "MDS with 1 - Jaccard Similarity")
mds_cos_mat <- cos_mat %>%
mds(type = "ordinal")
ggplot() +
geom_point(data = as_tibble(mds_cos_mat$conf), aes(x = D1, y = D2, colour = D2 > 0.5)) +
scale_colour_manual(values = setNames(c('#532d8e','grey'),c(T, F))) +
scale_alpha_manual(values = c(1, 0.01)) +
geom_text(as_tibble(mds_cos_mat$conf), mapping = aes(
x = -D1, y = -D2, color = D2 < -0.5, label = paste(rownames(mds_cos_mat$conf))), alpha = .7) +
geom_text_repel() +
theme_minimal() +
labs(title = "MDS with 1 - Cosine Similarity") +
theme(legend.position = "")
ggsave("img/cos_mds.png", dpi = 300)
mds_jac_mat <- jac_mat %>%
mds(type = "ordinal")
ggplot() +
geom_point(data = as_tibble(mds_jac_mat$conf), aes(x = D1, y = D2, colour = D2 > 0.5 | D2 < -0.55)) +
scale_colour_manual(values = setNames(c('#532d8e','grey'),c(T, F))) +
scale_alpha_manual(values = c(1, 0.01)) +
geom_text(as_tibble(mds_jac_mat$conf), mapping = aes(
x = -D1, y = -D2, color = D2 < -0.5 | D2 > 0.55, label = paste(rownames(mds_jac_mat$conf))), alpha = .7) +
geom_text_repel() +
theme_minimal() +
labs(title = "MDS with 1 - Jaccard Similarity") +
theme(legend.position = "")
ggsave("img/jac_mds.png", dpi = 300)